## ----------------------------------------------------------
## Code to clean Bukoba study Baseline/Endline surveys
## ----------------------------------------------------------

#### Load data ####
# Every input is read from ../data/bukoba/ relative to the working directory.

# Parent surveys for the VP Bukoba intervention
parentB <- read.csv(file = "../data/bukoba/PreMain.csv")  # pre-intervention
parentE <- read.csv(file = "../data/bukoba/PostMain.csv") # post-intervention

# Student assessments; text columns are kept as character (not factor) so the
# answer labels can be recoded to points later on
studentB <- read.csv(file = "../data/bukoba/GradeScale.csv",
                     stringsAsFactors = FALSE)  # pre-intervention
studentE <- read.csv(file = "../data/bukoba/GradeScale2.csv",
                     stringsAsFactors = FALSE)  # post-intervention

# Primary school data (all primary schools)
primary <- read.csv(file = "../data/bukoba/priperformingall.csv")

# Restrict to government schools in the BUKOBA(V) district, 2014 results
bukoba <- primary[with(primary,
                       DISTRICT == "BUKOBA(V)" &
                         OWNERSHIP == "GOVERNMENT" &
                         YEAR_OF_RESULT == 2014), ]

# Matched study schools and their intervention status
Bukstudyschools <- read.csv(file = "../data/bukoba/PilotSchoolsInterventionStatus.csv")

# Teacher absences recorded on the Post testing day
Bukteacherrabsences <- read.csv(file = "../data/bukoba/PostEndline_TeacherAbsences.csv")

  
#### Clean data ####

### Parent survey data
# Baseline ParentID: first non-missing of ParentID_B, ParentID_B2, ParentID_B3
parentB$ParentID <- with(
  parentB,
  ifelse(!is.na(ParentID_B),
         ParentID_B,
         ifelse(!is.na(ParentID_B2), ParentID_B2, ParentID_B3))
)

# Endline: keep only rows whose INTSTRT_E is "Yes" (parents surveyed at Post)
parentE <- parentE[parentE$INTSTRT_E == "Yes", ]

# Keep the endline treatment label under its own name, then drop the school
# identifier columns from the endline file so the merge does not duplicate them
parentE$SchoolCatID.post <- parentE$SchoolCatID

repvars <- is.element(names(parentE), c("SchoolCatID", "SchoolID"))
parentE <- parentE[!repvars]

# Left join on ParentID: every baseline parent is kept, endline columns are NA
# where there is no endline match
parentBuk <- merge(x = parentB, y = parentE, by = "ParentID", all.x = TRUE)

# Drop ParentID 580 (parent passed away and was replaced by a relative)
parentBuk <- parentBuk[parentBuk$ParentID != 580, ]

# Intervention groups
# Fix the level order of the treatment arm; any other value becomes NA
parentBuk$SchoolCatID <- factor(
  parentBuk$SchoolCatID,
  levels = c("Survey Only", "Info Workshop", "Validated Participation")
)

# Short treatment code (SO / IW / VP) as a factor in the same order
parentBuk$TreatCat <- factor(
  parentBuk$SchoolCatID,
  levels = c("Survey Only", "Info Workshop", "Validated Participation"),
  labels = c("SO", "IW", "VP")
)

# 0/1 indicators (NA when the treatment arm is NA)
parentBuk$VP <- as.numeric(parentBuk$SchoolCatID == "Validated Participation")

parentBuk$IW <- as.numeric(parentBuk$SchoolCatID == "Info Workshop")

# Mtg: either of the two arms that are not "Survey Only"
parentBuk$Mtg <- as.numeric(
  parentBuk$SchoolCatID == "Info Workshop" |
    parentBuk$SchoolCatID == "Validated Participation"
)

# Attach school-level information to each parent row. "Match" is the triplet
# match. The parent file's SchoolID is joined to NAME in the study-school file,
# and parents without a matching school are kept (all.x).
parentBuk <- merge(
  x = parentBuk,
  y = Bukstudyschools[c(
    "NAME", "WARD", "LONGITUDE", "LATITUDE",
    "PASS_RATE", "AVERAGE.PERFORMANCE.2015", "CHANGE_PREVIOUS_YEAR",
    "RANK", "PUPIL_TEACHER_RATIO", "totalenroll",
    "Match", "STD.III.TEACHERS", "STD.III.STUDENTS",
    "STD.III.PARENTS.ESTMATE"
  )],
  by.x = "SchoolID",
  by.y = "NAME",
  all.x = TRUE
)

# Survey times, Pre and Post
# Timestamps are parsed as day/month/year hour:minute:second. The duration is
# requested explicitly in minutes: plain subtraction of two date-times lets R
# choose the unit (secs, mins, hours or days) from the data, so the numeric
# value and the "> 100" cut-off below would not reliably be in minutes.

# survey times, Pre
parentBuk$start_B <- strptime(parentBuk$TIMESTRT_B, format="%d/%m/%Y  %H:%M:%S")
parentBuk$end_B <- strptime(parentBuk$TIMEND_B, format="%d/%m/%Y  %H:%M:%S")

parentBuk$minutes_B <- as.numeric(difftime(parentBuk$end_B, parentBuk$start_B,
                                           units = "mins"))

# durations above 100 minutes are set to missing
parentBuk$minutes_B[parentBuk$minutes_B > 100] <- NA

# survey times, Post
parentBuk$start_E <- strptime(parentBuk$TIMESTRT_E, format="%d/%m/%Y  %H:%M:%S")
parentBuk$end_E <- strptime(parentBuk$TIMEND_E, format="%d/%m/%Y  %H:%M:%S")

parentBuk$minutes_E <- as.numeric(difftime(parentBuk$end_E, parentBuk$start_E,
                                           units = "mins"))

# durations above 100 minutes are set to missing
parentBuk$minutes_E[parentBuk$minutes_E > 100] <- NA

# Parent gender: 0 when Gender_B is "Male", 1 for any other non-missing value
parentBuk$Female_B <- as.numeric(parentBuk$Gender_B != "Male")

# Parent age bin from DM1_B: 1 = 30 and under, 2 = 31-40, 3 = 41-50,
# 4 = 51-60, 5 = above 60 (NA when age is missing)
parentBuk$AgeBin <- with(
  parentBuk,
  ifelse(DM1_B <= 30, 1,
  ifelse(DM1_B <= 40, 2,
  ifelse(DM1_B <= 50, 3,
  ifelse(DM1_B <= 60, 4, 5))))
)

# Parent is head of household, dummy
parentBuk$HHhead_B <- as.numeric(parentBuk$HH2_B == "I am the head of household")

# Parent education as an ordered set of levels, then as the level number (1-9)
# NOTE(review): "Don't know" is the 9th level, so it is coded above every real
# education level in BC5_B_num and in the rescaled version -- confirm intended.
# NOTE(review): " University degree" carries a leading space; presumably this
# matches the raw data -- verify, otherwise those rows become NA.
parentBuk$BC5_B <- factor(
  parentBuk$BC5_B,
  levels = c("No schooling",
             "Some primary school",
             "Completed primary school",
             "Some secondary school",
             "Completed secondary school",
             "Some post-secondary school",
             "Diploma course/certificate",
             " University degree",
             "Don't know")
)

parentBuk$BC5_B_num <- as.numeric(parentBuk$BC5_B)

# same variable linearly rescaled to the range 1-5
parentBuk$BC5_B_num_rescaled <- scales::rescale(parentBuk$BC5_B_num, to = c(1, 5))


# Relationship to child: 1 = "Parent", 0 = any other answer, NA if missing
parentBuk$parent_B <- as.numeric(parentBuk$CH3_B == "Parent")

# Parent literate, dummy
parentBuk$literate_B <- as.numeric(parentBuk$BC6_B == "Yes")

# Parent religion, Islam dummy
parentBuk$muslim_B <- as.numeric(parentBuk$BC4_B == "Islam")

# Parent economic index: number of the ten items EI1_B ... EI10_B answered
# "Yes" (missing answers are not counted)
parentBuk$econindexSUM_B <- apply(
  parentBuk[, paste0("EI", 1:10, "_B")],
  MARGIN = 1,
  FUN = function(items) sum(items == "Yes", na.rm = TRUE)
)

# same count linearly rescaled to the range 1-5
parentBuk$econindexSUM_B_rescaled <- scales::rescale(parentBuk$econindexSUM_B,
                                                     to = c(1, 5))

# Parent poor, dummy: 1 when fewer than 4 items, 0 when 4 or more
# (4 items is described as the median)
parentBuk$poor_B <- as.numeric(parentBuk$econindexSUM_B < 4)

# Parent political, dummy: "Yes" -> 1, "No" -> 0, anything else -> NA
parentBuk$political_B <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$BC8_B)]
)

# Child disabilities: label for codes 1-12 of CH4_B, NA for any other value
parentBuk$disabilities_B <- c(
  "Seeing", "Hearing", "Communicating", "Movement",
  "Daily Life", "Intellectual", "Learning", "Psychiatric",
  "Autism", "Albinism", "Other", "None"
)[match(parentBuk$CH4_B, 1:12)]

# Any disability reported: 0 when the code is 12 ("None"), 1 for any other
# non-missing code
parentBuk$disabilities_B_bi <- as.numeric(parentBuk$CH4_B != 12)

# Parent knows the teacher's name? "Yes" -> 1, "No" -> 0, anything else -> NA
parentBuk$CCK1_B_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$CCK1_B)]
)

parentBuk$CCK1_E_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$CCK1_E)]
)

# Post minus Pre
parentBuk$CCK1_diff <- parentBuk$CCK1_E_num - parentBuk$CCK1_B_num

# Parent's belief about the school's performance relative to district average
parentBuk$CCK3_B <- factor(
  parentBuk$CCK3_B,
  levels = c("Worse", "About the Same", "Better", "Don't know")
)

# Parent asked about teacher absence last week? Same 1/0/NA coding
parentBuk$EQ1_B_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$EQ1_B)]
)

parentBuk$EQ1_E_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$EQ1_E)]
)

# Post minus Pre
parentBuk$EQ1_diff <- parentBuk$EQ1_E_num - parentBuk$EQ1_B_num

# Parent believes school's performance next year?
# Answers become an ordered 1-3 score (worse / same / improve). "Don't know"
# is set to NA by its label rather than by "largest observed code": when no
# respondent answered "Don't know", the largest observed code is a real
# answer and must not be blanked.
parentBuk$EQ2_B <- factor(parentBuk$EQ2_B,
                          levels = c("It will do worse",
                                     "It will stay about the same",
                                     "It will improve its ranking",
                                     "Don't know"))

parentBuk$EQ2_B_num <- as.numeric(parentBuk$EQ2_B)
parentBuk$EQ2_B_num[parentBuk$EQ2_B %in% "Don't know"] <- NA #change Don't Know into NA

parentBuk$EQ2_E <- factor(parentBuk$EQ2_E,
                          levels = c("It will do worse",
                                     "It will stay about the same",
                                     "It will improve its ranking",
                                     "Don't know"))

parentBuk$EQ2_E_num <- as.numeric(parentBuk$EQ2_E)
parentBuk$EQ2_E_num[parentBuk$EQ2_E %in% "Don't know"] <- NA

# Post minus Pre
parentBuk$EQ2_diff <- parentBuk$EQ2_E_num - parentBuk$EQ2_B_num

# Parent believes child performance this year?
# Answers become an ordered 1-5 score (Very bad ... Very good). "Don't know"
# is set to NA by its label rather than by "largest observed code", so a real
# answer is never blanked when nobody answered "Don't know".
parentBuk$EQ3_B <- factor(parentBuk$EQ3_B,
                          levels = c("Very bad", "Bad", "Average",
                                     "Good", "Very good", "Don't know"))

parentBuk$EQ3_B_num <- as.numeric(parentBuk$EQ3_B)
parentBuk$EQ3_B_num[parentBuk$EQ3_B %in% "Don't know"] <- NA

parentBuk$EQ3_E <- factor(parentBuk$EQ3_E,
                          levels = c("Very bad", "Bad", "Average",
                                     "Good", "Very good", "Don't know"))

parentBuk$EQ3_E_num <- as.numeric(parentBuk$EQ3_E)
parentBuk$EQ3_E_num[parentBuk$EQ3_E %in% "Don't know"] <- NA

# Post minus Pre
parentBuk$EQ3_diff <- parentBuk$EQ3_E_num - parentBuk$EQ3_B_num

# How likely child will complete primary school?
# The five answer labels are ordered so the level number is the 1-5 score.
parentBuk$EQ4_B <- factor(
  parentBuk$EQ4_B,
  levels = c("1 (Definitely NO)", "2", "3", "4", "5 (Definitely YES)")
)
parentBuk$EQ4_B_num <- as.numeric(parentBuk$EQ4_B)

parentBuk$EQ4_E <- factor(
  parentBuk$EQ4_E,
  levels = c("1 (Definitely NO)", "2", "3", "4", "5 (Definitely YES)")
)
parentBuk$EQ4_E_num <- as.numeric(parentBuk$EQ4_E)
# (an earlier "- 1" shift of the endline score, meant to keep both waves on
#  the same scale, is disabled; both waves use the 1-5 level number as is)

# Post minus Pre
parentBuk$EQ4_diff <- parentBuk$EQ4_E_num - parentBuk$EQ4_B_num

# Skewed, so also make BINARY: 1 when the score is 5, 0 when below 5,
# NA when missing
parentBuk$EQ4_B_bi <- as.numeric(parentBuk$EQ4_B_num == 5)

parentBuk$EQ4_E_bi <- as.numeric(parentBuk$EQ4_E_num == 5)

# How likely child will complete secondary school?
# The five answer labels are ordered so the level number is the 1-5 score.
parentBuk$EQ5_B <- factor(
  parentBuk$EQ5_B,
  levels = c("1 (Definitely NO)", "2", "3", "4", "5 (Definitely YES)")
)
parentBuk$EQ5_B_num <- as.numeric(parentBuk$EQ5_B)

parentBuk$EQ5_E <- factor(
  parentBuk$EQ5_E,
  levels = c("1 (Definitely NO)", "2", "3", "4", "5 (Definitely YES)")
)
parentBuk$EQ5_E_num <- as.numeric(parentBuk$EQ5_E)
# (an earlier "- 1" shift of the endline score is disabled; both waves use
#  the 1-5 level number as is)

# Post minus Pre
parentBuk$EQ5_diff <- parentBuk$EQ5_E_num - parentBuk$EQ5_B_num

# Skewed, so also make BINARY: 1 when the score is 5, 0 when below 5,
# NA when missing
parentBuk$EQ5_B_bi <- as.numeric(parentBuk$EQ5_B_num == 5)

parentBuk$EQ5_E_bi <- as.numeric(parentBuk$EQ5_E_num == 5)

# Teaching quality of Standard 3 at this school?
# Answers become an ordered 1-5 score (Very bad ... Very good). "Don't know"
# is set to NA by its label rather than by "largest observed code", so a real
# answer is never blanked when nobody answered "Don't know".
parentBuk$EQ6_B <- factor(parentBuk$EQ6_B,
                          levels = c("Very bad", "Bad", "Average",
                                     "Good", "Very good", "Don't know"))

parentBuk$EQ6_B_num <- as.numeric(parentBuk$EQ6_B)
parentBuk$EQ6_B_num[parentBuk$EQ6_B %in% "Don't know"] <- NA

parentBuk$EQ6_E <- factor(parentBuk$EQ6_E,
                          levels = c("Very bad", "Bad", "Average",
                                     "Good", "Very good", "Don't know"))

parentBuk$EQ6_E_num <- as.numeric(parentBuk$EQ6_E)
parentBuk$EQ6_E_num[parentBuk$EQ6_E %in% "Don't know"] <- NA

# Post minus Pre
parentBuk$EQ6_diff <- parentBuk$EQ6_E_num - parentBuk$EQ6_B_num

# Should parents monitor teacher attendance?
# Answers become an ordered 1-3 score (No / Yes, sometimes / Yes, always).
# "Don't know" is set to NA by its label, separately for each wave. The
# endline recode must be driven by the endline answers: it previously used the
# baseline score as the row selector, which blanked the wrong rows.
parentBuk$PP1_B <- factor(parentBuk$PP1_B,
                          levels = c("No", "Yes, sometimes",
                                     "Yes, always", "Don't know"))

parentBuk$PP1_B_num <- as.numeric(parentBuk$PP1_B)
parentBuk$PP1_B_num[parentBuk$PP1_B %in% "Don't know"] <- NA

parentBuk$PP1_E <- factor(parentBuk$PP1_E,
                          levels = c("No", "Yes, sometimes",
                                     "Yes, always", "Don't know"))

parentBuk$PP1_E_num <- as.numeric(parentBuk$PP1_E)
parentBuk$PP1_E_num[parentBuk$PP1_E %in% "Don't know"] <- NA

# Post minus Pre
parentBuk$PP1_diff <- parentBuk$PP1_E_num - parentBuk$PP1_B_num

# Look at homework last week? "Yes" -> 1, "No" -> 0, anything else -> NA
parentBuk$PP2_B_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$PP2_B)]
)

parentBuk$PP2_E_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$PP2_E)]
)

# Post minus Pre
parentBuk$PP2_diff <- parentBuk$PP2_E_num - parentBuk$PP2_B_num

# Drop off/pick up child last week? Same 1/0/NA coding
parentBuk$PP3_B_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$PP3_B)]
)

parentBuk$PP3_E_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$PP3_E)]
)

# Post minus Pre
parentBuk$PP3_diff <- parentBuk$PP3_E_num - parentBuk$PP3_B_num


# Spoken to child's teacher about performance in past month? (Post only)
# "Yes" -> 1, "No" -> 0, anything else -> NA
parentBuk$CP1_E_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$CP1_E)]
)

# Child have breakfast? Same 1/0/NA coding
parentBuk$CB1_B_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$CB1_B)]
)

parentBuk$CB1_E_num <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$CB1_E)]
)

# Post minus Pre
parentBuk$CB1_diff <- parentBuk$CB1_E_num - parentBuk$CB1_B_num

# Days child DIDN'T work last week: the stored value is days worked, so it is
# replaced by 7 minus itself. This overwrites CB2_B / CB2_E in place, so
# running these lines a second time would flip the values back.
parentBuk[c("CB2_B", "CB2_E")] <- 7 - parentBuk[c("CB2_B", "CB2_E")]

# Post minus Pre
parentBuk$CB2_diff <- parentBuk$CB2_E - parentBuk$CB2_B

# Biggest problem affecting child's school? Stored as text, with one long
# answer label shortened
parentBuk$PO1_B <- as.character(parentBuk$PO1_B)

parentBuk$PO1_B <- replace(
  parentBuk$PO1_B,
  which(parentBuk$PO1_B == "Insufficient educational materials / facilities (desks, books, library)"),
  "Insufficient materials / facilities"
)

# How many parents have you spoken to outside of meetings?
# Code 6 is "Don't know" and is set to missing in each of the three items
is.na(parentBuk$PN3_E) <- which(parentBuk$PN3_E == 6)
is.na(parentBuk$PN2_E) <- which(parentBuk$PN2_E == 6)
is.na(parentBuk$PN1_E) <- which(parentBuk$PN1_E == 6)

# shift the remaining codes so that they start from 0
parentBuk$PN1_E_num <- parentBuk$PN1_E - 1
parentBuk$PN2_E_num <- parentBuk$PN2_E - 1
parentBuk$PN3_E_num <- parentBuk$PN3_E - 1

# SC member? "Yes" -> 1, "No" -> 0, anything else -> NA
parentBuk$SC2_B_bi <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$SC2_B)]
)

# Education efficacy score, convert to numeric and take means
# The end points of the 1-5 scale are stored with an anchor text, e.g.
# "5 (VERY Confident)". The anchor is stripped by pattern (leading digits
# followed by an opening parenthesis) so that both "1(NOT AT ALL Confident)"
# and "1 (NOT AT ALL Confident)" are read as 1 in either wave; an exact-match
# recode silently turns the answer into NA whenever the spacing differs.
# Any value that is still not a number after stripping becomes NA.
EEF_B_qs <- c("EEF1_B", "EEF2_B", "EEF3_B", "EEF4_B",
              "EEF5_B", "EEF6_B", "EEF7_B", "EEF8_B", "EEF9_B")

EEF_B <- parentBuk[, EEF_B_qs]

EEF_B <- data.frame(lapply(EEF_B, as.character), stringsAsFactors = FALSE)

EEF_B[] <- lapply(EEF_B, function(x) {
  sub("^[[:space:]]*([0-9]+)[[:space:]]*\\(.*$", "\\1", x)
})

EEF_B <- data.frame(lapply(EEF_B, as.numeric), stringsAsFactors = FALSE)

# write the numeric items back into the main data
parentBuk[, EEF_B_qs] <- EEF_B

# Row means ignore missing items; a row with no answered item gives NaN
parentBuk$EEFscore_B <- rowMeans(EEF_B, na.rm = TRUE) #total score
parentBuk$EEFinternal_B <- rowMeans(EEF_B[, 1:4], na.rm = TRUE) #internal efficacy
parentBuk$EEFexternal_B <- rowMeans(EEF_B[, 5:9], na.rm = TRUE) #external efficacy
parentBuk$EEFindiv_B <- rowMeans(EEF_B[, c(2,3,4,6,7)], na.rm = TRUE) #individual efficacy
parentBuk$EEFgroup_B <- rowMeans(EEF_B[, c(1,5,8,9)], na.rm = TRUE) #group related efficacy

EEF_E_qs <- c("EEF1_E", "EEF2_E", "EEF3_E", "EEF4_E",
              "EEF5_E", "EEF6_E", "EEF7_E", "EEF8_E", "EEF9_E")

EEF_E <- parentBuk[, EEF_E_qs]

EEF_E <- data.frame(lapply(EEF_E, as.character), stringsAsFactors = FALSE)

EEF_E[] <- lapply(EEF_E, function(x) {
  sub("^[[:space:]]*([0-9]+)[[:space:]]*\\(.*$", "\\1", x)
})

EEF_E <- data.frame(lapply(EEF_E, as.numeric), stringsAsFactors = FALSE)

# write the numeric items back into the main data
parentBuk[, EEF_E_qs] <- EEF_E

parentBuk$EEFscore_E <- rowMeans(EEF_E, na.rm = TRUE) #total score
parentBuk$EEFinternal_E <- rowMeans(EEF_E[, 1:4], na.rm = TRUE) #internal efficacy
parentBuk$EEFexternal_E <- rowMeans(EEF_E[, 5:9], na.rm = TRUE) #external efficacy
parentBuk$EEFindiv_E <- rowMeans(EEF_E[, c(2,3,4,6,7)], na.rm = TRUE) #individual efficacy
parentBuk$EEFgroup_E <- rowMeans(EEF_E[, c(1,5,8,9)], na.rm = TRUE) #group related efficacy

# Post minus Pre for each score
parentBuk$EEFscore_diff <- parentBuk$EEFscore_E - parentBuk$EEFscore_B #total score pre-post diff
parentBuk$EEFinternal_diff <- parentBuk$EEFinternal_E - parentBuk$EEFinternal_B
parentBuk$EEFexternal_diff <- parentBuk$EEFexternal_E - parentBuk$EEFexternal_B
parentBuk$EEFindiv_diff <- parentBuk$EEFindiv_E - parentBuk$EEFindiv_B
parentBuk$EEFgroup_diff <- parentBuk$EEFgroup_E - parentBuk$EEFgroup_B

# use PCA to make a factor analysis EEF variable
# Each wave gets its own PCA, fitted on centred and scaled items. Both fits
# use complete rows only: prcomp() stops with an error when the input
# contains missing values, and the baseline fit previously did not drop them.
EEFpca_B <- prcomp(na.omit(EEF_B), center = TRUE, scale. = TRUE)

#plot(EEFpca_B, type = "l") #diagnostics
summary(EEFpca_B) #look at variance for each pca
psych::alpha(EEF_B) #cronbach's alpha

EEFpca_E <- prcomp(na.omit(EEF_E), center = TRUE, scale. = TRUE)

#plot(EEFpca_E, type = "l") #diagnostics
summary(EEFpca_E)
psych::alpha(EEF_E) #cronbach's alpha

# First principal component score for every row; rows with a missing item
# get NA.
# NOTE(review): the two waves are projected onto separately fitted components,
# and the sign of a principal component is arbitrary, so the Pre/Post
# difference below may not be comparable across waves -- confirm.
parentBuk$EEFpc1_B <- predict(EEFpca_B, newdata = EEF_B)[,1]
parentBuk$EEFpc1_E <- predict(EEFpca_E, newdata = EEF_E)[,1]
parentBuk$EEFpc1_diff <- parentBuk$EEFpc1_E - parentBuk$EEFpc1_B #total score pre-post diff

# Responsibility to educate child, parent or teacher (1-7 scale)
# The end points are stored with an anchor text ("1 (FULLY PARENT)",
# "7 (FULLY TEACHER)"); the anchor is stripped and the remaining text is
# converted to a number. The column is converted to character first: calling
# as.numeric() directly on a factor returns the internal level code, which
# equals the printed value only when every one of the seven answers occurs and
# no other level (such as a blank) is present.
parentBuk$EEF10_B_num <- as.numeric(
  sub("^[[:space:]]*([0-9]+)[[:space:]]*\\(.*$", "\\1",
      as.character(parentBuk$EEF10_B))
)

parentBuk$EEF10_E_num <- as.numeric(
  sub("^[[:space:]]*([0-9]+)[[:space:]]*\\(.*$", "\\1",
      as.character(parentBuk$EEF10_E))
)

# Post minus Pre
parentBuk$EEF10_diff <- parentBuk$EEF10_E_num - parentBuk$EEF10_B_num

# General efficacy score, convert to numeric
# Each of the ten items is recoded 1 ("Not at all true") to 4 ("Completely
# true") and the items are summed per row, ignoring missing items. A row with
# no answered item sums to 0; since any answered item contributes at least 1,
# a total of 0 can only mean "nothing answered" and is set to NA in BOTH
# waves (previously only at endline).
GEF_B_qs <- c("GEF1_B", "GEF2_B", "GEF3_B", "GEF4_B", "GEF5_B",
              "GEF6_B", "GEF7_B", "GEF8_B", "GEF9_B", "GEF10_B")

GEF_B <- parentBuk[, GEF_B_qs]

GEF_B <- data.frame(lapply(GEF_B, as.character), stringsAsFactors = FALSE)

GEF_B[GEF_B == "Completely true"] <- "4"
GEF_B[GEF_B == "Mostly true"] <- "3"
GEF_B[GEF_B == "Somewhat true"] <- "2"
GEF_B[GEF_B == "Not at all true"] <- "1"

GEF_B <- data.frame(lapply(GEF_B, as.numeric), stringsAsFactors = FALSE)

parentBuk$GEFscore_B <- rowSums(GEF_B, na.rm = TRUE)
parentBuk$GEFscore_B[parentBuk$GEFscore_B == 0] <- NA

GEF_E_qs <- c("GEF1_E", "GEF2_E", "GEF3_E", "GEF4_E", "GEF5_E", "GEF6_E",
              "GEF7_E", "GEF8_E", "GEF9_E", "GEF10_E")

GEF_E <- parentBuk[, GEF_E_qs]

GEF_E <- data.frame(lapply(GEF_E, as.character), stringsAsFactors = FALSE)

GEF_E[GEF_E == "Completely true"] <- "4"
GEF_E[GEF_E == "Mostly true"] <- "3"
GEF_E[GEF_E == "Somewhat true"] <- "2"
GEF_E[GEF_E == "Not at all true"] <- "1"

GEF_E <- data.frame(lapply(GEF_E, as.numeric), stringsAsFactors = FALSE)

parentBuk$GEFscore_E <- rowSums(GEF_E, na.rm = TRUE)
parentBuk$GEFscore_E[parentBuk$GEFscore_E == 0] <- NA

# Post minus Pre
parentBuk$GEFscore_diff <- parentBuk$GEFscore_E - parentBuk$GEFscore_B

# Concerns to pass onto HT? "Yes" -> 1, "No" -> 0, anything else -> NA
parentBuk$FU1_E_bi <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$FU1_E)]
)

# Concerns to pass onto DEO? Same 1/0/NA coding
parentBuk$FU2_E_bi <- unname(
  c(No = 0, Yes = 1)[as.character(parentBuk$FU2_E)]
)

# Buy child's test score? Code 2 is rewritten as 0; other codes are unchanged
parentBuk$FU3_E <- replace(parentBuk$FU3_E, which(parentBuk$FU3_E == 2), 0)


### Kiufunza student assessments

# Pre-intervention tests
# The key shared with the parent file is taken from the Confrm column
studentB$ParentID <- studentB$Confrm

# Recode answer labels as their point values (still text at this stage);
# the replacement is applied to every cell of the data frame
studentB[studentB == "1.The whole text (6 point)"] <- "6"
studentB[studentB == "2.Some of the text (3 point)"] <- "3"
studentB[studentB == "3.None of the text (0 point)"] <- "0"

studentB[studentB == "Correct (6 point)"] <- "6"
studentB[studentB == "Incorrect (0 point)"] <- "0"
studentB[studentB == "Correct (1 point)"] <- "1"

# Convert every column whose name contains "Q_" to numeric
studentB[,grep("Q_", names(studentB))] <- data.frame(lapply(studentB[,grep("Q_", names(studentB))],
                                  as.numeric), stringsAsFactors=FALSE)

# Assessment times
studentB$start_SB <- strptime(studentB$intbeg, format="%d/%m/%Y  %H:%M:%S") #SB is student base
studentB$end_SB <- strptime(studentB$intend, format="%d/%m/%Y  %H:%M:%S")

# Duration requested explicitly in minutes: plain subtraction of date-times
# lets R pick the unit (secs, mins, hours or days) from the data
studentB$minutes_SB <- as.numeric(difftime(studentB$end_SB, studentB$start_SB,
                                           units = "mins"))

# Subject scores: row sums over the question columns of each subject
# (selected by name prefix), with missing answers counted as 0
studentB$KFTotal_B <- rowSums(studentB[,grep("Q_", names(studentB))], na.rm = TRUE)

studentB$Kiswahili_B <- rowSums(studentB[,grep("KQ_", names(studentB))], na.rm = TRUE)
studentB$English_B <- rowSums(studentB[,grep("EQ_", names(studentB))], na.rm = TRUE)
studentB$Addition_B <- rowSums(studentB[,grep("AQ_", names(studentB))], na.rm = TRUE)
studentB$Subtraction_B <- rowSums(studentB[,grep("SQ_", names(studentB))], na.rm = TRUE)
studentB$Multiplication_B <- rowSums(studentB[,grep("MQ_", names(studentB))], na.rm = TRUE)
studentB$Division_B <- rowSums(studentB[,grep("DQ_", names(studentB))], na.rm = TRUE)
# Math is the sum of the four arithmetic sub-scores
studentB$Math_B <- rowSums(studentB[, c("Addition_B", "Subtraction_B",
                                        "Multiplication_B", "Division_B")])

# Post-intervention tests
# The key shared with the parent file is taken from the StudentID column;
# SchoolID is dropped from the endline student file
studentE$ParentID <- studentE$StudentID
studentE$SchoolID <- NULL

# Recode answer labels as their point values (still text at this stage);
# the replacement is applied to every cell of the data frame
studentE[studentE == "1.The whole text (6 point)"] <- "6"
studentE[studentE == "2.Some of the text (3 point)"] <- "3"
studentE[studentE == "3.None of the text (0 point)"] <- "0"

studentE[studentE == "Correct (6 point)"] <- "6"
studentE[studentE == "Incorrect (0 point)"] <- "0"
studentE[studentE == "Correct (1 point)"] <- "1"

# Convert every column whose name contains "Q_" to numeric
studentE[,grep("Q_", names(studentE))] <- data.frame(lapply(studentE[,grep("Q_", names(studentE))],
                                  as.numeric), stringsAsFactors=FALSE)

# Assessment times
studentE$start_SE <- strptime(studentE$intbeg, format="%d/%m/%Y  %H:%M:%S")
studentE$end_SE <- strptime(studentE$intend, format="%d/%m/%Y  %H:%M:%S")

# Duration requested explicitly in minutes: plain subtraction of date-times
# lets R pick the unit (secs, mins, hours or days) from the data
studentE$minutes_SE <- as.numeric(difftime(studentE$end_SE, studentE$start_SE,
                                           units = "mins"))

# Subject scores: row sums over the question columns of each subject
# (selected by name prefix), with missing answers counted as 0
studentE$KFTotal_E <- rowSums(studentE[,grep("Q_", names(studentE))], na.rm = TRUE)

studentE$Kiswahili_E <- rowSums(studentE[,grep("KQ_", names(studentE))], na.rm = TRUE)
studentE$English_E <- rowSums(studentE[,grep("EQ_", names(studentE))], na.rm = TRUE)
studentE$Addition_E <- rowSums(studentE[,grep("AQ_", names(studentE))], na.rm = TRUE)
studentE$Subtraction_E <- rowSums(studentE[,grep("SQ_", names(studentE))], na.rm = TRUE)
studentE$Multiplication_E <- rowSums(studentE[,grep("MQ_", names(studentE))], na.rm = TRUE)
studentE$Division_E <- rowSums(studentE[,grep("DQ_", names(studentE))], na.rm = TRUE)
# Math is the sum of the four arithmetic sub-scores
studentE$Math_E <- rowSums(studentE[, c("Addition_E", "Subtraction_E",
                                        "Multiplication_E", "Division_E")])

# Family tree outcomes
# FamTree_recode: copy of FamTree, set to 0 where the assignment is blank
# (both BranchCount and NameNber missing)
studentB$FamTree_recode <- studentB$FamTree

studentB$FamTree_recode[is.na(studentB$BranchCount) &
                          is.na(studentB$NameNber)] <- 0

# Count of family members: BranchCount when present, otherwise NameNber
studentB$FamTreeCt <- ifelse(is.na(studentB$BranchCount),
                             studentB$NameNber,
                             studentB$BranchCount)

# Same count with outliers (above 40) set to missing
studentB$FamTreeCt.so <- studentB$FamTreeCt

is.na(studentB$FamTreeCt.so) <- which(studentB$FamTreeCt.so > 40)

# Did a parent help? Code 2 is treated as missing
is.na(studentB$ParentHelp) <- which(studentB$ParentHelp == 2)

# A blank assignment with a recorded ParentHelp value counts as "no help"
studentB$ParentHelp[is.na(studentB$BranchCount) &
                      is.na(studentB$NameNber) &
                      !is.na(studentB$ParentHelp)] <- 0

# Count only if parent helped
# NOTE(review): rows where ParentHelp is NA keep their count here, because a
# missing selector leaves the value untouched -- confirm this is intended.
studentB$FamTreeCt.so.ph <- studentB$FamTreeCt.so

is.na(studentB$FamTreeCt.so.ph) <- which(studentB$ParentHelp != 1)

# Indicator for the three problematic schools in which teachers didn't follow
# our instructions: Kasharu (SO), Nyakato (SO) and Kyenge (IW)
studentB$FamTree.schproblem <- as.numeric(
  studentB$SchoolID == "KASHARU PR. SCHOOL" |
    studentB$SchoolID == "NYAKATO PR. SCHOOL" |
    studentB$SchoolID == "KYENGE PR. SCHOOL"
)

# Merge main file with student file
# Drop the school columns from the baseline student file first, since the
# parent file already carries them
repvars <- is.element(names(studentB), c("SchoolCatID", "SchoolID"))
studentB <- studentB[!repvars]

# Students present in both waves (inner join on ParentID)
studentBuk <- merge(x = studentB, y = studentE, by = "ParentID")

# Every parent row is kept; student columns are NA where there is no match
studyBuk <- merge(x = parentBuk, y = studentBuk,
                  by = "ParentID", all.x = TRUE)

# Post-Pre student testing scores
studyBuk$KFTotal_diff <- studyBuk$KFTotal_E - studyBuk$KFTotal_B

#summarySE(studyBuk, "KFTotal_diff", "SchoolCatID")

### Post matching weights using CBPS to correct baseline educational efficacy imbalance
# NOTE(review): the weights are assigned row by row, which presumably requires
# CBPS to keep every row of studyBuk (no missing SchoolCatID / EEFscore_B) --
# verify.
cbps <- CBPS(SchoolCatID ~ EEFscore_B, data = studyBuk)
studyBuk$cbpsweights <- cbps$weights

### Subset by treatment group
# which() keeps only rows where the comparison is TRUE. Subsetting with the
# bare comparison would add an all-NA row for every row whose SchoolCatID is
# missing.
VPBuk <- studyBuk[which(studyBuk$SchoolCatID == "Validated Participation"), ]
IWBuk <- studyBuk[which(studyBuk$SchoolCatID == "Info Workshop"), ]
SOBuk <- studyBuk[which(studyBuk$SchoolCatID == "Survey Only"), ]

# Subgroup data sets, each re-weighted with CBPS on its own rows.
# All subsets use which(), so rows with a missing grouping value are dropped
# instead of being turned into all-NA rows (which is what subsetting with a
# bare comparison does). Such phantom rows would also make the number of CBPS
# weights disagree with the number of rows in the subset.

### Subset by Parent Gender
studyBuk.Fem <- studyBuk[which(studyBuk$Female_B == 1), ] #women only
studyBuk.Men <- studyBuk[which(studyBuk$Female_B == 0), ] #men only

# re-weight based on subsets
cbps <- CBPS(SchoolCatID ~ EEFscore_B, data = studyBuk.Fem)
studyBuk.Fem$cbpsweights <- cbps$weights

cbps <- CBPS(SchoolCatID ~ EEFscore_B, data = studyBuk.Men)
studyBuk.Men$cbpsweights <- cbps$weights

### Subset by Child Gender
studyBuk.Girl <- studyBuk[which(studyBuk$ChildGender == "Female"), ] #girls only
studyBuk.Boy <- studyBuk[which(studyBuk$ChildGender == "Male"), ] #boys only

# re-weight based on subsets
cbps <- CBPS(SchoolCatID ~ EEFscore_B, data = studyBuk.Girl)
studyBuk.Girl$cbpsweights <- cbps$weights

cbps <- CBPS(SchoolCatID ~ EEFscore_B, data = studyBuk.Boy)
studyBuk.Boy$cbpsweights <- cbps$weights


### Subset by Wealth (under 4 items, 4 items or more; same cut-off as poor_B)
studyBuk.Poor <- studyBuk[which(studyBuk$econindexSUM_B < 4), ] #under 4 items
studyBuk.Rich <- studyBuk[which(studyBuk$econindexSUM_B >= 4), ] #4 or more items

# re-weight based on subsets
cbps <- CBPS(SchoolCatID ~ EEFscore_B, data = studyBuk.Poor)
studyBuk.Poor$cbpsweights <- cbps$weights

cbps <- CBPS(SchoolCatID ~ EEFscore_B, data = studyBuk.Rich)
studyBuk.Rich$cbpsweights <- cbps$weights


### Subset by lower baseline efficacy parents
# Only keep parents whose baseline efficacy score was under 4. which() drops
# rows whose score is missing (EEFscore_B is NaN when no item was answered)
# instead of turning them into all-NA rows.
studyBuksub <- studyBuk[which(studyBuk$EEFscore_B < 4), ]

# re-weight based on sub-sample
cbps <- CBPS(SchoolCatID ~ EEFscore_B, data = studyBuksub)
studyBuksub$cbpsweights <- cbps$weights

# Treatment-group subsets of the low-efficacy sample
VPBuksub <- studyBuksub[which(studyBuksub$SchoolCatID == "Validated Participation"), ]
IWBuksub <- studyBuksub[which(studyBuksub$SchoolCatID == "Info Workshop"), ]
SOBuksub <- studyBuksub[which(studyBuksub$SchoolCatID == "Survey Only"), ]

# Remove intermediate objects that are no longer needed: raw inputs, the
# intermediate merges, the GEF working tables, the question-name vectors, the
# last CBPS fit and the column mask
rm(list = c(
  "GEF_B", "GEF_E", "cbps",
  "parentB", "parentE", "parentBuk",
  "primary", "studentB", "studentE",
  "studentBuk", "EEF_B_qs", "EEF_E_qs",
  "GEF_B_qs", "GEF_E_qs", "repvars"
))

### Save cleaned data files
#write.csv(parentBuk, file = "MergeMain.csv")
#write.csv(studyBuk, file = "MergeParentStudent.csv")

### School level averages
# Calculate school EEF score averages for all parents, and low eff base subset parents
# EEFscoreave_B <- tapply(studyBuk$EEFscore_B, studyBuk$SchoolID, mean, na.rm = T)
# EEFscoreave_E <- tapply(studyBuk$EEFscore_E, studyBuk$SchoolID, mean, na.rm = T)
# EEFscoreave_change <- EEFscoreave_E - EEFscoreave_B
# 
# # calculate for lower baseline efficacy subset (under 4)
# EEFscoreave_B.sub <- tapply(studyBuksub$EEFscore_B, studyBuksub$SchoolID, mean, na.rm = T)
# EEFscoreave_E.sub <- tapply(studyBuksub$EEFscore_E, studyBuksub$SchoolID, mean, na.rm = T)
# EEFscoreave_change.sub <- EEFscoreave_E.sub - EEFscoreave_B.sub
# 
# schoolEEFscores <- as.data.frame(cbind(
#                     SchoolID = names(EEFscoreave_B.sub),
#                     EEFscoreave_B,
#                     EEFscoreave_E,
#                     EEFscoreave_change,
#                     EEFscoreave_B.sub,
#                     EEFscoreave_E.sub,
#                     EEFscoreave_change.sub
#                     ))
# 
# Bukstudyschoolsscores <- merge(schoolEEFscores, Bukstudyschools, 
#                             by.x = "SchoolID", by.y = "NAME")
# 
# Bukstudyschoolsscores <- merge(Bukstudyschoolsscores, Bukteacherrabsences, by = "SchoolID")
